Setup

Install the required packages and load the libraries. If you have not installed the tidyverse and lubridate packages yet, uncomment the install.packages() lines below and run them first.

#install.packages("tidyverse") 
#install.packages("lubridate")
library(tidyverse)
library(lubridate)

# Read the stop records into `sn`, straight from the web.
# To read a local copy instead, use the commented-out call below.
# sn <- read_csv(file = "~/Desktop/Code/pa_philadelphia_2019_02_25.csv")
sn <- read_csv(
  file = "https://datajournalism.tech/pa_philadelphia_2019_02_25.csv"
)

Data Analysis

Explore the dataset provided by Stanford University. See more on their website https://openpolicing.stanford.edu.

# Open the whole table in the data viewer.
View(sn)
# Print every column's type together with its first few values.
str(sn)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1891916 obs. of  19 variables:
##  $ raw_row_number  : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ date            : Date, format: "2014-09-27" "2014-09-27" ...
##  $ time            : 'hms' num  20:05:00 19:27:00 19:27:00 20:38:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ location        : chr  "5600 BLOCK BROOMALL ST" "5200 BLOCK PENTRIDGE ST" "5200 BLOCK PENTRIDGE ST" "4900 BLOCK KINGSESSING AV" ...
##  $ lat             : num  39.9 39.9 39.9 39.9 40 ...
##  $ lng             : num  -75.2 -75.2 -75.2 -75.2 -75.2 ...
##  $ district        : chr  "12" "12" "12" "12" ...
##  $ service_area    : chr  "124" "124" "124" "123" ...
##  $ subject_age     : num  24 58 31 29 35 39 49 61 43 48 ...
##  $ subject_race    : chr  "black" "black" "black" "black" ...
##  $ subject_sex     : chr  "male" "male" "male" "male" ...
##  $ type            : chr  "pedestrian" "pedestrian" "pedestrian" "vehicular" ...
##  $ arrest_made     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ outcome         : chr  NA NA NA NA ...
##  $ contraband_found: logi  NA NA NA NA NA NA ...
##  $ frisk_performed : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ search_conducted: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ search_person   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ search_vehicle  : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   raw_row_number = col_double(),
##   ..   date = col_date(format = ""),
##   ..   time = col_time(format = ""),
##   ..   location = col_character(),
##   ..   lat = col_double(),
##   ..   lng = col_double(),
##   ..   district = col_character(),
##   ..   service_area = col_character(),
##   ..   subject_age = col_double(),
##   ..   subject_race = col_character(),
##   ..   subject_sex = col_character(),
##   ..   type = col_character(),
##   ..   arrest_made = col_logical(),
##   ..   outcome = col_character(),
##   ..   contraband_found = col_logical(),
##   ..   frisk_performed = col_logical(),
##   ..   search_conducted = col_logical(),
##   ..   search_person = col_logical(),
##   ..   search_vehicle = col_logical()
##   .. )
sn %>% glimpse() # compact overview: one row per column, with type and leading values
## Observations: 1,891,916
## Variables: 19
## $ raw_row_number   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …
## $ date             <date> 2014-09-27, 2014-09-27, 2014-09-27, 2014-09-27…
## $ time             <time> 20:05:00, 19:27:00, 19:27:00, 20:38:00, 03:00:…
## $ location         <chr> "5600 BLOCK BROOMALL ST", "5200 BLOCK PENTRIDGE…
## $ lat              <dbl> 39.94374, 39.94522, 39.94522, 39.94304, 39.9502…
## $ lng              <dbl> -75.23279, -75.22476, -75.22476, -75.21521, -75…
## $ district         <chr> "12", "12", "12", "12", "09", "09", "09", "09",…
## $ service_area     <chr> "124", "124", "124", "123", "092", "091", "092"…
## $ subject_age      <dbl> 24, 58, 31, 29, 35, 39, 49, 61, 43, 48, 62, 38,…
## $ subject_race     <chr> "black", "black", "black", "black", "black", "b…
## $ subject_sex      <chr> "male", "male", "male", "male", "male", "female…
## $ type             <chr> "pedestrian", "pedestrian", "pedestrian", "vehi…
## $ arrest_made      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ outcome          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ contraband_found <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ frisk_performed  <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ search_conducted <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ search_person    <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ search_vehicle   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
names(sn) # list the column headers
##  [1] "raw_row_number"   "date"             "time"            
##  [4] "location"         "lat"              "lng"             
##  [7] "district"         "service_area"     "subject_age"     
## [10] "subject_race"     "subject_sex"      "type"            
## [13] "arrest_made"      "outcome"          "contraband_found"
## [16] "frisk_performed"  "search_conducted" "search_person"   
## [19] "search_vehicle"
# Recode the two demographic text columns (`subject_sex`, `subject_race`)
# from character to factor, so their categories are treated as levels.
sn <- sn %>%
  mutate(
    subject_sex = as.factor(subject_sex),
    subject_race = as.factor(subject_race)
  )

After viewing the dataset, you can analyze it to see the min, max, mean, median and other values for each variable. These are called descriptive statistics.

sn %>% summary() # descriptive statistics for every column
##  raw_row_number         date                time         
##  Min.   :      1   Min.   :2014-01-01   Length:1891916   
##  1st Qu.: 472980   1st Qu.:2015-02-06   Class1:hms       
##  Median : 945958   Median :2016-01-04   Class2:difftime  
##  Mean   : 945958   Mean   :2016-02-01   Mode  :numeric   
##  3rd Qu.:1418937   3rd Qu.:2017-02-07                    
##  Max.   :1891916   Max.   :2018-04-14                    
##                                                          
##    location              lat              lng           district        
##  Length:1891916     Min.   :39.88    Min.   :-75.28   Length:1891916    
##  Class :character   1st Qu.:39.96    1st Qu.:-75.20   Class :character  
##  Mode  :character   Median :39.99    Median :-75.16   Mode  :character  
##                     Mean   :39.99    Mean   :-75.16                     
##                     3rd Qu.:40.02    3rd Qu.:-75.13                     
##                     Max.   :40.14    Max.   :-74.96                     
##                     NA's   :106768   NA's   :106768                     
##  service_area        subject_age                     subject_race    
##  Length:1891916     Min.   : 10.00   asian/pacific islander:  40531  
##  Class :character   1st Qu.: 24.00   black                 :1265616  
##  Mode  :character   Median : 31.00   hispanic              : 185503  
##                     Mean   : 34.69   other/unknown         :  20696  
##                     3rd Qu.: 44.00   white                 : 379570  
##                     Max.   :110.00                                   
##                     NA's   :4612                                     
##  subject_sex          type           arrest_made       outcome         
##  female: 470421   Length:1891916     Mode :logical   Length:1891916    
##  male  :1420807   Class :character   FALSE:1795709   Class :character  
##  NA's  :    688   Mode  :character   TRUE :96207     Mode  :character  
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##  contraband_found frisk_performed search_conducted search_person  
##  Mode :logical    Mode :logical   Mode :logical    Mode :logical  
##  FALSE:84702      FALSE:1721352   FALSE:1773689    FALSE:1794287  
##  TRUE :33525      TRUE :170564    TRUE :118227     TRUE :97629    
##  NA's :1773689                                                    
##                                                                   
##                                                                   
##                                                                   
##  search_vehicle 
##  Mode :logical  
##  FALSE:1854385  
##  TRUE :37531    
##                 
##                 
##                 
## 

There are a few dplyr verbs that you need to memorize. See more at https://learn.r-journalism.com/en/wrangling/dplyr/dplyr/. First, the select verb grabs one or more columns from a dataset, and the filter verb keeps only the rows that match a condition.

# Keep only the six columns used in this example and store them in `a`.
a <- sn %>%
  select(subject_race, subject_age, subject_sex, type, contraband_found, arrest_made)
# a <- select(sn, subject_race) # the same verb without the pipe, one column only

# Among the stops whose subject is recorded as 10 years old, count the rows
# for each value of `arrest_made`.
b <- a %>%
  filter(subject_age == 10) %>%
  group_by(arrest_made) %>%
  summarize(value = n())

Second, the group_by verb splits the rows into groups that share the same value of a variable. The summarize verb usually goes along with group_by: here it counts the number of rows in each group and computes each group's share of the whole dataset.

# Count the stops recorded for each sex, work out each group's share of all
# rows, and store the result in a table called `sex`.
sex <- sn %>%
  group_by(subject_sex) %>%
  summarize(
    value = n(),
    prop = value / nrow(sn)
  )
## Warning: Factor `subject_sex` contains implicit NA, consider using
## `forcats::fct_explicit_na`
View(sex) # open the `sex` table in the data viewer

# Count the stops recorded for each race and each group's share of all rows,
# sort from the largest group to the smallest, and store the result in `race`.
race <- sn %>%
  group_by(subject_race) %>%
  summarize(
    value = n(),
    prop = value / nrow(sn)
  ) %>%
  arrange(desc(value))
View(race) # open the `race` table in the data viewer

#Now it's your turn. Tell me how many stopped drivers were found with contraband? Or how many searches ended successfully? Write your code in the next line, without the hashtag.

Data Visualization

We will need certain packages to be installed and called before creating our charts.

library(ggplot2)
#install.packages("devtools")
#devtools::install_github('bbc/bbplot')
library(bbplot)

Line Chart

To create this chart, we will use the ggplot2 package. Let’s create a simple line chart by adapting the code on the Stanford University’s Open Policing Project website. This chart displays the number of drivers stopped by the police in each year, with one line per race.

# Tally the stops for every calendar year and race, then draw one coloured
# line per race with a dot at each yearly count.
line <- sn %>%
  count(year = year(date), subject_race) %>%
  ggplot(mapping = aes(x = year, y = n, colour = subject_race)) +
  geom_point() +
  geom_line() +
  bbc_style()

# Typing the object's name renders the chart.
line

Bar Chart

# Data prep: number of recorded stops for each race.
barprep <- sn %>%
  group_by(subject_race) %>%
  summarize(value = n())

# Print large numbers in full rather than in scientific notation; set before
# the chart is printed so the axis labels are affected.
options(scipen = 10000)

# Bar chart of the counts, bars ordered by size and flipped to run horizontally.
bar <- ggplot(barprep, aes(x = reorder(subject_race, value), y = value)) +
  geom_col(position = "identity", fill = "red") + # bar length is `value` itself
  geom_hline(yintercept = 0) +
  bbc_style() +
  labs(
    title = "Stopped Drivers by Race",
    subtitle = "African American drivers got stopped the most in the city of Philadelphia, Pennsylvania"
  ) +
  coord_flip()

# Typing the object's name renders the chart.
bar

You can export a graphic by running the name of its object in the Console and clicking the Export button under the Plots tab. Alternatively, you can use the following code to export it:

# Export the bar chart as PNG and SVG. `plot = bar` names the chart
# explicitly; without it ggsave() writes whichever plot was displayed last.
ggsave("bar.png", plot = bar, width = 40, height = 20, units = "cm")
ggsave("bar.svg", plot = bar, width = 40, height = 20, units = "cm")
# To export the line chart the same way: ggsave("line.png", plot = line)

Pie Chart

This will be made with the ggplot2 package.

# A pie drawn as a single stacked bar (one fill colour per race) whose
# y axis is wrapped around a circle.
ggplot(data = barprep, mapping = aes(x = "", y = value, fill = subject_race)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0)

The following pie will be made with the plotly package.

#install.packages("plotly")
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Data prep: number of stops for each race, largest group first.
pieprep <- sn %>%
  group_by(subject_race) %>%
  summarize(value = n()) %>%
  arrange(desc(value))

# Slice colours, handed to plotly through `marker$colors` below.
color <- c("gray", "pink", "red", "yellow", "green")

# Pie chart: white in-slice labels, white slice borders, race name on hover.
pie <- plot_ly(
  data = pieprep,
  labels = ~subject_race,
  values = ~value,
  type = "pie",
  textposition = "inside",
  insidetextfont = list(color = "white"),
  hoverinfo = "text",
  text = ~paste(subject_race),
  marker = list(colors = color, line = list(color = "white", width = 1)),
  showlegend = FALSE # spelled out: `F` is an ordinary variable that can be reassigned
) %>%
  layout(title = "Stopped Drivers by Race")

# Typing the object's name renders the chart.
pie

Now we will make a donut chart. Here is how:

# First way: start from the prepared `pieprep` table; `hole` cuts out the
# centre of the pie to make a ring.
plot_ly(
  data = pieprep,
  labels = ~subject_race,
  values = ~value,
  showlegend = FALSE # spelled out: `F` is an ordinary variable that can be reassigned
) %>%
  add_pie(hole = 0.5) %>%
  layout(title = "Stopped Drivers by Race")

# Second way: count the stops per race and build the donut in one pipeline.
donut <- sn %>%
  group_by(subject_race) %>%
  summarise(value = n()) %>%
  plot_ly(labels = ~subject_race, values = ~value, showlegend = FALSE) %>%
  add_pie(hole = 0.5) %>%
  layout(title = "Stopped Drivers by Race")

# Typing the object's name renders the chart.
donut

Map with ggplot2

#list of packages we need
#install.packages("ggplot2")
#install.packages("ggmap")
#install.packages("maps")
#install.packages("mapdata")

#Call out the packages
library(ggplot2)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
## 
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
## 
##     wind
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(mapdata)

# Outline data for the base maps, each fetched with map_data().
usa <- map_data(map = "usa")
states <- map_data(map = "state")
counties <- map_data(map = "county")
world <- map_data(map = "world2Hires")

# Keep only the stops dated 2017.
f <- sn %>% filter(year(date) == 2017)

# Keep only the outline rows for Philadelphia, Pennsylvania.
philly <- counties %>%
  filter(region == "pennsylvania", subregion == "philadelphia")

# Draw the outline, plot every 2017 stop on top of it coloured by race,
# and split the map into one panel per race.
ggplot(data = philly) +
  geom_polygon(mapping = aes(x = long, y = lat, group = group)) +
  coord_fixed(ratio = 1.3) +
  geom_point(
    data = f,
    mapping = aes(x = lng, y = lat, color = subject_race),
    size = 1
  ) +
  facet_wrap(~subject_race)
## Warning: Removed 8475 rows containing missing values (geom_point).

#install.packages("httpuv")
#install.packages("leaflet")
library(httpuv)
library(leaflet)
# Build the interactive map one layer at a time: empty widget, default tile
# layer, initial view, then a single marker with a popup label.
m <- leaflet()
m <- addTiles(m)
m <- setView(m, lng = -75.172347, lat = 39.952150, zoom = 16)
m <- addMarkers(m, lng = -75.172347, lat = 39.952150, popup = "Philadelphia, PA")

# Typing the object's name renders the map.
m
# Colour lookup: pairs each race label in `domain` with the colour at the
# same position in the palette.
# NOTE(review): this rebinds `race`, which earlier held the race summary
# table; the table is no longer reachable under this name afterwards.
race <- colorFactor(
  palette = c("pink", "black", "yellow", "red", "blue"),
  domain = c("white", "black", "asian/pacific islander", "hispanic", "other/unknown"),
  ordered = TRUE
)

# One translucent circle per row of `f`, coloured by race, with a popup
# sentence built from the subject's race and sex.
m2 <- leaflet(f) %>%
  addProviderTiles(providers$CartoDB) %>%
  setView(lng = -75.172347, lat = 39.952150, zoom = 16) %>%
  addCircleMarkers(
    lng = ~lng,
    lat = ~lat,
    popup = paste("This is a", f$subject_race, "and", f$subject_sex, "driver."),
    weight = 3,
    radius = 4,
    color = ~race(subject_race),
    stroke = FALSE, # spelled out: `F` is an ordinary variable that can be reassigned
    fillOpacity = 0.5
  )
## Warning in validateCoords(lng, lat, funcName): Data contains 8475 rows with
## either missing or invalid lat/lon values and will be ignored
m2